library(readr)
library(ggplot2)
library(e1071)
library(readxl)
library(gridExtra)
library(tidyverse)
library(countrycode)
library(sf)
library(tmap)
library(lintr)
library(plotly)
library(MASS) # for boxcox normalisation
library(rmapshaper)
library(mapview)
library(leaflet)
library(RColorBrewer)
library(DT)
library(GGally)
library(dplyr)
library(psych)
library(knitr)
library(factoextra)
library(GGally)
library(data.table)
library(rpart)
library(rpart.plot)
library(rattle)
library(caret)csc3007-cyan2
1 Importing of libraries
2 Selecting the indicators
#mortality_rate_attributed_to_unsafe_water
mortality_rate_unsafe_water <- read_csv("data/indicator_3.9.2.csv",show_col_types = FALSE)
#Proportion_of_population_using_safely_managed_drinking_water_services
proportion_of_safe_water<- read_csv("data/indicator_6.1.1.csv",show_col_types = FALSE)
#mortality_rate_unintentional_poisoning
mortality_rate_unintentional_poisoning <- read_csv("data/indicator_3.9.3.csv",show_col_types = FALSE)
#proportion-of-population-with-basic-handwashing-facilities-on-premises-by-urban-rural-percent
population_with_basic_handwashing_facilities<- read_csv("data/indicator_6.2.1.csv",show_col_types = FALSE)3 Map of Indicator 1
3.1 Select specific columns for mortality rate unintentional poisoning
mortality_rate_unintentional_poisoning <- dplyr::select(
mortality_rate_unintentional_poisoning,
code = 'geoAreaCode',
country = 'geoAreaName',
region = 'parentName',
iso = 'ISO3',
gender = 'sex_desc',
mortality_rate_unintentional_poisoning_2019 = 'value_2019'
)
# drop rows if contains at least one NA value
mortality_rate_unintentional_poisoning <- na.omit(mortality_rate_unintentional_poisoning)
#check for empty row
any(is.na(mortality_rate_unintentional_poisoning))[1] FALSE
mortality_rate_unintentional_poisoning# A tibble: 597 × 6
code country region iso gender mortality_rate_unint…¹
<dbl> <chr> <chr> <chr> <chr> <dbl>
1 470 Malta Southern Europe MLT Both … 0.1
2 152 Chile South America CHL Male 0.5
3 4 Afghanistan Southern Asia (excludi… AFG Both … 1
4 470 Malta Southern Europe MLT Female 0
5 156 China Eastern Asia CHN Both … 1.8
6 4 Afghanistan Southern Asia (excludi… AFG Both … 1
7 470 Malta Southern Europe MLT Male 0.2
8 156 China Eastern Asia CHN Female 1.5
9 478 Mauritania Western Africa MRT Both … 1.5
10 478 Mauritania Western Africa MRT Female 1.3
# ℹ 587 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
3.2 Histogram of mortality rate unintential poisoning for year 2019
# Create the histogram
mortality_histo <- ggplot(mortality_rate_unintentional_poisoning, aes(x=mortality_rate_unintentional_poisoning_2019)) +
geom_histogram(binwidth=0.1, fill='blue', color='black') +
labs(title="Mortality rate unintentional poisoning for Year 2019", x="Mortality Rate (per 100,000)", y="Number of Country")
mortality_histo
3.3 Select specific columns for population with basic handwashing facilities
population_with_basic_handwashing_facilities <- dplyr::select(
population_with_basic_handwashing_facilities,
code = 'geoAreaCode',
country = 'geoAreaName',
region = 'parentName',
iso = 'ISO3',
area = 'location_desc',
pop_with_basic_handwashing_facilites_2019 = 'value_2019'
)
# drop rows if contains at least one NA value
population_with_basic_handwashing_facilities <- na.omit(population_with_basic_handwashing_facilities)
#check for empty row
any(is.na(mortality_rate_unintentional_poisoning))[1] FALSE
population_with_basic_handwashing_facilities# A tibble: 284 × 6
code country region iso area pop_with_basic_handw…¹
<dbl> <chr> <chr> <chr> <chr> <dbl>
1 4 Afghanistan Southern Asia (excludi… AFG All … 38
2 356 India Southern Asia IND Urban 82
3 4 Afghanistan Southern Asia (excludi… AFG All … 38
4 360 Indonesia South-Eastern Asia IDN All … 94
5 4 Afghanistan Southern Asia (excludi… AFG Rural 29
6 694 Sierra Leone Western Africa SLE Rural 18
7 360 Indonesia South-Eastern Asia IDN Rural 91
8 4 Afghanistan Southern Asia (excludi… AFG Rural 29
9 4 Afghanistan Southern Asia (excludi… AFG Urban 64
10 4 Afghanistan Southern Asia (excludi… AFG Urban 64
# ℹ 274 more rows
# ℹ abbreviated name: ¹pop_with_basic_handwashing_facilites_2019
3.4 Histogram of population with basic handwashing facilities for year 2019
# Create the histogram
pop_with_handwashing_facilities_histo <- ggplot(population_with_basic_handwashing_facilities, aes(x=pop_with_basic_handwashing_facilites_2019)) +
geom_histogram(binwidth=5, fill='blue', color='black') +
labs(title="Population with basic handwashing facilities for Year 2019", x="Population (%)", y="Number of Country")
pop_with_handwashing_facilities_histo
3.5 Left join between mortality rate and basic handwashing
countries_tb <- left_join(
mortality_rate_unintentional_poisoning,
population_with_basic_handwashing_facilities,
by = join_by(country, iso)
) |>
mutate(
iso = case_match(
iso,
"COD" ~ "ZAR",
"ROU" ~ "ROM",
"TLS" ~ "TMP",
"XKX" ~ "KSV",
.default = iso
)
)Warning in left_join(mortality_rate_unintentional_poisoning, population_with_basic_handwashing_facilities, : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 3 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
countries_tb# A tibble: 1,323 × 10
code.x country region.x iso gender mortality_rate_unint…¹ code.y region.y
<dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 470 Malta Souther… MLT Both … 0.1 NA <NA>
2 152 Chile South A… CHL Male 0.5 NA <NA>
3 4 Afghanis… Souther… AFG Both … 1 4 Souther…
4 4 Afghanis… Souther… AFG Both … 1 4 Souther…
5 4 Afghanis… Souther… AFG Both … 1 4 Souther…
6 4 Afghanis… Souther… AFG Both … 1 4 Souther…
7 4 Afghanis… Souther… AFG Both … 1 4 Souther…
8 4 Afghanis… Souther… AFG Both … 1 4 Souther…
9 470 Malta Souther… MLT Female 0 NA <NA>
10 156 China Eastern… CHN Both … 1.8 NA <NA>
# ℹ 1,313 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: area <chr>,
# pop_with_basic_handwashing_facilites_2019 <dbl>
3.6 Import map
countries_sf <- read_sf("./data/WB_countries_Admin0.geojson")
land_sf <- read_sf("./data/WB_Land.geojson")countries_sf <-
countries_sf |>
ms_simplify() |> # Default argument `keep = 0.05`
st_make_valid()
land_sf <- ms_simplify(land_sf)
npts(countries_sf)[1] 30826
countries_sf <-
countries_sf |>
left_join(
countries_tb,
by = join_by(WB_A3 == iso)
) |>
dplyr::select(country, continent = 'CONTINENT', gender = 'gender', area = 'area', iso = WB_A3, mortality_rate_unintentional_poisoning_2019, pop_with_basic_handwashing_facilites_2019)
countries_sfSimple feature collection with 1246 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -180 ymin: -55.62754 xmax: 179.9038 ymax: 83.6341
Geodetic CRS: WGS 84
# A tibble: 1,246 × 8
country continent gender area iso mortality_rate_unintentional…¹
<chr> <chr> <chr> <chr> <chr> <dbl>
1 Indonesia Asia Both sexes All areas IDN 0.3
2 Indonesia Asia Both sexes Rural IDN 0.3
3 Indonesia Asia Both sexes Urban IDN 0.3
4 Indonesia Asia Female All areas IDN 0.1
5 Indonesia Asia Female Rural IDN 0.1
6 Indonesia Asia Female Urban IDN 0.1
7 Indonesia Asia Male All areas IDN 0.5
8 Indonesia Asia Male Rural IDN 0.5
9 Indonesia Asia Male Urban IDN 0.5
10 Malaysia Asia Both sexes <NA> MYS 0.7
# ℹ 1,236 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
# geometry <MULTIPOLYGON [°]>
polygon <- st_polygon(x = list(rbind(
c(-180.0001, 90),
c(-179.9999, 90),
c(-179.9999, -90),
c(-180.0001, -90),
c(-180.0001, 90)
))) |>
st_sfc() |>
st_set_crs(4326) # Equirectangular projection
countries_sf <- mutate(
countries_sf,
geometry = st_difference(geometry, polygon)
) |>
st_make_valid()
tm_shape(countries_sf) + tm_polygons()
3.7 Map of population with basic handwashing facilities for year 2019
countries_sf$area <- replace_na(countries_sf$area, "missing")
choropleth_pop <-
tm_shape(land_sf, projection = "ESRI:54012") +
tm_polygons(col = "grey") +
tm_shape(countries_sf) +
tm_polygons(
col = "pop_with_basic_handwashing_facilites_2019",
border.col = "grey30",
palette = "Greens",
breaks = c(-Inf, 20, 40, 60, 80, 100, Inf),
colorNA = "grey",
title = "Population (%)",
lwd = 0.5
) +
tm_text("iso", size = "AREA") + # Use the new column to set the size
tm_credits(
c("", "Source: unstats-undesa.opendata.arcgis.com"),
position = c("right", "bottom")
) +
tm_layout(
bg.color = "lightblue",
frame = FALSE,
inner.margins = c(0.00, 0.2, 0.2, 0.1),
earth.boundary = TRUE,
space.color = "white",
main.title.size = 0.8,
title.size = 0.5,
main.title = "Population with basic handwashing facilities by Country for 2019"
)
choropleth_pop
3.8 Map of mortality rate unintentional poisoning for year 2019
countries_sf$gender <- replace_na(countries_sf$gender, "missing")
choropleth_mor<-
tm_shape(land_sf, projection = "ESRI:54012") +
tm_polygons(col = "grey") +
tm_shape(countries_sf) +
tm_polygons(
col = "mortality_rate_unintentional_poisoning_2019",
border.col = "grey30",
palette = "Oranges",
breaks = c(-Inf, 0.2, 0.4, 0.6, 0.8, 1, Inf),
colorNA = "grey",
title = "Mortality Rate \nper 100,000",
lwd = 0.5
) +
tm_text("iso", size = "AREA") +
tm_credits(
c("", "Source: unstats-undesa.opendata.arcgis.com"),
position = c("right", "bottom")
) +
tm_layout(
bg.color = "lightblue",
frame = FALSE,
inner.margins = c(0.00, 0.2, 0.2, 0.1),
earth.boundary = TRUE,
space.color = "white",
main.title.size = 0.8,
title.size = 0.5,
main.title = "Mortality Rate Unintentional Poisoning by Country for 2019"
)
choropleth_mor
3.9 Filtering of areas
countries_sf_all_areas <- countries_sf[countries_sf$area == "All areas",]
countries_sf_urban <- countries_sf[countries_sf$area == "Urban",]
countries_sf_rural <- countries_sf[countries_sf$area == "Rural",]
countries_sf_all_areasSimple feature collection with 324 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -172.7497 ymin: -34.80869 xmax: 166.1479 ymax: 55.38515
Geodetic CRS: WGS 84
# A tibble: 324 × 8
country continent gender area iso mortality_rate_unint…¹
<chr> <chr> <chr> <chr> <chr> <dbl>
1 Indonesia Asia Both … All … IDN 0.3
2 Indonesia Asia Female All … IDN 0.1
3 Indonesia Asia Male All … IDN 0.5
4 Bolivia (Plurinational S… South Am… Both … All … BOL 0.6
5 Bolivia (Plurinational S… South Am… Female All … BOL 0.5
6 Bolivia (Plurinational S… South Am… Male All … BOL 0.6
7 India Asia Both … All … IND 0.3
8 India Asia Female All … IND 0.2
9 India Asia Male All … IND 0.3
10 Ethiopia Africa Both … All … ETH 3.3
# ℹ 314 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
# geometry <MULTIPOLYGON [°]>
countries_sf_urbanSimple feature collection with 306 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -157.449 ymin: -34.80869 xmax: 166.1479 ymax: 55.38515
Geodetic CRS: WGS 84
# A tibble: 306 × 8
country continent gender area iso mortality_rate_unint…¹
<chr> <chr> <chr> <chr> <chr> <dbl>
1 Indonesia Asia Both … Urban IDN 0.3
2 Indonesia Asia Female Urban IDN 0.1
3 Indonesia Asia Male Urban IDN 0.5
4 Bolivia (Plurinational S… South Am… Both … Urban BOL 0.6
5 Bolivia (Plurinational S… South Am… Female Urban BOL 0.5
6 Bolivia (Plurinational S… South Am… Male Urban BOL 0.6
7 India Asia Both … Urban IND 0.3
8 India Asia Female Urban IND 0.2
9 India Asia Male Urban IND 0.3
10 Ethiopia Africa Both … Urban ETH 3.3
# ℹ 296 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
# geometry <MULTIPOLYGON [°]>
countries_sf_ruralSimple feature collection with 309 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -157.449 ymin: -34.80869 xmax: 166.1479 ymax: 55.38515
Geodetic CRS: WGS 84
# A tibble: 309 × 8
country continent gender area iso mortality_rate_unint…¹
<chr> <chr> <chr> <chr> <chr> <dbl>
1 Indonesia Asia Both … Rural IDN 0.3
2 Indonesia Asia Female Rural IDN 0.1
3 Indonesia Asia Male Rural IDN 0.5
4 Bolivia (Plurinational S… South Am… Both … Rural BOL 0.6
5 Bolivia (Plurinational S… South Am… Female Rural BOL 0.5
6 Bolivia (Plurinational S… South Am… Male Rural BOL 0.6
7 Peru South Am… Both … Rural PER 0.4
8 Peru South Am… Female Rural PER 0.3
9 Peru South Am… Male Rural PER 0.5
10 India Asia Both … Rural IND 0.3
# ℹ 299 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
# geometry <MULTIPOLYGON [°]>
# Get a color palette with three colors
colors <- brewer.pal(3, "Set1")
# Assign colors to the categories
colorpal_area <- colorFactor(colors,
levels = c("All areas", "Urban", "Rural"))3.10 Map of population with basic handwashing facilities for all areas
leaflet(countries_sf_all_areas) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorpal_area(area),
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~iso,
popup = ~paste("Country Code: ", iso,
"<br>Country: ", country,
"<br>Population with basic handwashing facilities (%): ", pop_with_basic_handwashing_facilites_2019,
"<br>Area: ", area)) %>%
addLegend(pal = colorpal_area,
values = ~area,
title = "Area",
position = "bottomright")3.11 Map of population with basic handwashing facilities for urban areas
leaflet(countries_sf_urban) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorpal_area(area), # use colorpal_area here
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~iso, # This adds labels that appear when you hover over a country
popup = ~paste("Country Code: ", iso,
"<br>Country: ", country,
"<br>Population with basic handwashing facilities (%): ", pop_with_basic_handwashing_facilites_2019,
"<br>Area: ", area)) %>%
addLegend(pal = colorpal_area,
values = ~area,
title = "Area",
position = "bottomright")3.12 Map of population with basic handwashing facilities for rural areas
leaflet(countries_sf_rural) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorpal_area(area), # use colorpal_area here
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~iso, # This adds labels that appear when you hover over a country
popup = ~paste("Country Code: ", iso,
"<br>Country: ", country,
"<br>Population with basic handwashing facilities (%): ", pop_with_basic_handwashing_facilites_2019,
"<br>Area: ", area)) %>%
addLegend(pal = colorpal_area,
values = ~area,
title = "Area",
position = "bottomright")3.13 Filtering of genders
countries_sf_male <- countries_sf[countries_sf$gender == "Male",]
countries_sf_female <- countries_sf[countries_sf$gender == "Female",]
countries_sf_both <- countries_sf[countries_sf$gender == "Both sexes",]
countries_sf_maleSimple feature collection with 410 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.11652
Geodetic CRS: WGS 84
# A tibble: 410 × 8
country continent gender area iso mortality_rate_unint…¹
<chr> <chr> <chr> <chr> <chr> <dbl>
1 Indonesia Asia Male All … IDN 0.5
2 Indonesia Asia Male Rural IDN 0.5
3 Indonesia Asia Male Urban IDN 0.5
4 Malaysia Asia Male miss… MYS 1
5 Chile South Am… Male miss… CHL 0.5
6 Bolivia (Plurinational S… South Am… Male All … BOL 0.6
7 Bolivia (Plurinational S… South Am… Male Rural BOL 0.6
8 Bolivia (Plurinational S… South Am… Male Urban BOL 0.6
9 Peru South Am… Male Rural PER 0.5
10 Argentina South Am… Male miss… ARG 0.5
# ℹ 400 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
# geometry <MULTIPOLYGON [°]>
countries_sf_femaleSimple feature collection with 410 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.11652
Geodetic CRS: WGS 84
# A tibble: 410 × 8
country continent gender area iso mortality_rate_unint…¹
<chr> <chr> <chr> <chr> <chr> <dbl>
1 Indonesia Asia Female All … IDN 0.1
2 Indonesia Asia Female Rural IDN 0.1
3 Indonesia Asia Female Urban IDN 0.1
4 Malaysia Asia Female miss… MYS 0.3
5 Chile South Am… Female miss… CHL 0.2
6 Bolivia (Plurinational S… South Am… Female All … BOL 0.5
7 Bolivia (Plurinational S… South Am… Female Rural BOL 0.5
8 Bolivia (Plurinational S… South Am… Female Urban BOL 0.5
9 Peru South Am… Female Rural PER 0.3
10 Argentina South Am… Female miss… ARG 0.3
# ℹ 400 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
# geometry <MULTIPOLYGON [°]>
countries_sf_bothSimple feature collection with 410 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.11652
Geodetic CRS: WGS 84
# A tibble: 410 × 8
country continent gender area iso mortality_rate_unint…¹
<chr> <chr> <chr> <chr> <chr> <dbl>
1 Indonesia Asia Both … All … IDN 0.3
2 Indonesia Asia Both … Rural IDN 0.3
3 Indonesia Asia Both … Urban IDN 0.3
4 Malaysia Asia Both … miss… MYS 0.7
5 Chile South Am… Both … miss… CHL 0.4
6 Bolivia (Plurinational S… South Am… Both … All … BOL 0.6
7 Bolivia (Plurinational S… South Am… Both … Rural BOL 0.6
8 Bolivia (Plurinational S… South Am… Both … Urban BOL 0.6
9 Peru South Am… Both … Rural PER 0.4
10 Argentina South Am… Both … miss… ARG 0.4
# ℹ 400 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
# geometry <MULTIPOLYGON [°]>
# Get a color palette with three colors
colors <- brewer.pal(3, "Set1")
# Assign colors to the categories
colorpal_gender <- colorFactor(colors,
levels = c("Female", "Male", "Both sexes"))3.14 Map of mortality rate unintentional poisoning for male gender
# Replace NAs in 'gender' column
countries_sf_male$gender <- replace_na(countries_sf_male$gender, "missing")
leaflet(countries_sf_male) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorpal_gender(gender),
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~iso,
popup = ~paste("Country Code: ", iso,
"<br>Country: ", country,
"<br>Mortality Rate Unintentional Poisoning per 100,000: ", mortality_rate_unintentional_poisoning_2019,
"<br>Gender: ", gender)) %>%
addLegend(pal = colorpal_gender,
values = ~gender,
title = "Gender",
position = "bottomright")3.15 Map of mortality rate unintentional poisoning for female gender
# Replace NAs in 'gender' column
countries_sf_female$gender <- replace_na(countries_sf_female$gender, "missing")
leaflet(countries_sf_female) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorpal_gender(gender),
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~iso,
popup = ~paste("Country Code: ", iso,
"<br>Country: ", country,
"<br>Mortality Rate Unintentional Poisoning per 100,000: ", mortality_rate_unintentional_poisoning_2019,
"<br>Gender: ", gender)) %>%
addLegend(pal = colorpal_gender,
values = ~gender,
title = "Gender",
position = "bottomright")3.16 Map of mortality rate unintentional poisoning for both gender
# Replace NAs in 'gender' column
countries_sf_both$gender <- replace_na(countries_sf_both$gender, "missing")
leaflet(countries_sf_both) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorpal_gender(gender), # Assign color based on gender
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~iso,
popup = ~paste("Country Code: ", iso,
"<br>Country: ", country,
"<br>Mortality Rate Unintentional Poisoning per 100,000: ", mortality_rate_unintentional_poisoning_2019,
"<br>Gender: ", gender)) %>%
addLegend(pal = colorpal_gender,
values = ~gender,
title = "Gender",
position = "bottomright")4 Plots of Indicator between mortality rate and population with basic handwashing facilities
countries_sf <- countries_sf[!is.na(countries_sf$mortality_rate_unintentional_poisoning_2019) & !is.na(countries_sf$pop_with_basic_handwashing_facilites_2019), ]
countries_sf <- countries_sf %>%
filter(gender == 'Both sexes')
countries_sfSimple feature collection with 313 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -172.7497 ymin: -34.80869 xmax: 166.1479 ymax: 55.38515
Geodetic CRS: WGS 84
# A tibble: 313 × 8
country continent gender area iso mortality_rate_unint…¹
* <chr> <chr> <chr> <chr> <chr> <dbl>
1 Indonesia Asia Both … All … IDN 0.3
2 Indonesia Asia Both … Rural IDN 0.3
3 Indonesia Asia Both … Urban IDN 0.3
4 Bolivia (Plurinational S… South Am… Both … All … BOL 0.6
5 Bolivia (Plurinational S… South Am… Both … Rural BOL 0.6
6 Bolivia (Plurinational S… South Am… Both … Urban BOL 0.6
7 Peru South Am… Both … Rural PER 0.4
8 India Asia Both … Urban IND 0.3
9 India Asia Both … All … IND 0.3
10 India Asia Both … Rural IND 0.3
# ℹ 303 more rows
# ℹ abbreviated name: ¹mortality_rate_unintentional_poisoning_2019
# ℹ 2 more variables: pop_with_basic_handwashing_facilites_2019 <dbl>,
# geometry <MULTIPOLYGON [°]>
4.1 Correlation
# Plot the data
plot_loess <- ggplot(countries_sf, aes(x = mortality_rate_unintentional_poisoning_2019, y = pop_with_basic_handwashing_facilites_2019)) +
geom_point(aes(color = area,
shape = area,
text = paste("Country:", country, "<br>", "Location:", area)), alpha = 0.7) +
geom_smooth(data = subset(countries_sf, area == "All areas"),
method = loess, se = FALSE,
aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
show.legend = TRUE) +
geom_smooth(data = subset(countries_sf, area == "Urban"),
method = loess, se = FALSE,
aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
show.legend = TRUE) +
geom_smooth(data = subset(countries_sf, area == "Rural"),
method = loess, se = FALSE,
aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
show.legend = TRUE) +
labs(x = "Mortality Rate (deaths per 100,000 population) for 2019 ",
y = "Population with Basic Handwashing Facilities for 2019 (%)",
title = "Correlation between Mortality Rate and Basic Handwashing Facilities \n for Both Gender",
color = "Location Type",
shape = "Location Type") +
theme_minimal() +
guides(color = guide_legend(override.aes = list(shape = 16)),
shape = guide_legend(override.aes = list(color = "black")))Warning in geom_point(aes(color = area, shape = area, text = paste("Country:",
: Ignoring unknown aesthetics: text
# Convert to a plotly plot
gp_map_1 <- ggplotly(plot_loess, tooltip = "text")`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
gp_map_1correlation_all <- cor(countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "All areas"],
countries_sf$mortality_rate_unintentional_poisoning_2019[countries_sf$area == "All areas"])
correlation_urban <- cor(countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "Urban"],
countries_sf$mortality_rate_unintentional_poisoning_2019[countries_sf$area == "Urban"])
correlation_rural <- cor(countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "Rural"],
countries_sf$mortality_rate_unintentional_poisoning_2019[countries_sf$area == "Rural"])
blank_plot <- ggplot() +
theme_void() +
theme(plot.margin = margin(1, 1, 1, 1, "cm")) +
annotate("text",
x = 0.5,
y = 0.5,
label = paste("Correlation All:",
round(correlation_all, 2), "\n",
"Correlation Urban:",
round(correlation_urban, 2), "\n",
"Correlation Rural:",
round(correlation_rural, 2)),
size = 5,
color = "black",
hjust = 0.5)
# Convert to a plotly plot
gp_text_map_1 <- ggplotly(blank_plot)
gp_text_map_1stacked_bar_chart <- ggplot(countries_sf, aes(x = mortality_rate_unintentional_poisoning_2019, fill = area)) +
geom_bar(position = "stack") +
labs(x = "Mortality Rate (deaths per 100,000 population)",
y = "Count",
title = "Stacked Bar Chart of Mortality Rates") +
theme_minimal()
# Convert to a plotly plot
ggplot_stacked_bar_chart <- ggplotly(stacked_bar_chart)
# Create a grouped bar chart
grouped_bar_chart <- ggplot(countries_sf, aes(x = mortality_rate_unintentional_poisoning_2019, fill = area)) +
geom_bar(position = "dodge") +
labs(x = "Mortality Rate (deaths per 100,000 population)",
y = "Count",
title = "Grouped Bar Chart of Mortality Rates") +
theme_minimal()
# Convert to a plotly plot
ggplot_grouped_bar_chart <- ggplotly(grouped_bar_chart)4.2 Barchart and stacked barchart (without Transformation)
ggplot_stacked_bar_chart <- ggplotly(stacked_bar_chart, width = 1000, height = 600, autosize = TRUE)
ggplot_grouped_bar_chart <- ggplotly(grouped_bar_chart, width = 1000, height = 600, autosize = TRUE)
ggplot_stacked_bar_chartggplot_grouped_bar_chart4.3 Normalise Data
#Ensure you have the MASS library installed
countries_sf$mortality_rate_unintentional_poisoning_2019.positive <- countries_sf$mortality_rate_unintentional_poisoning_2019 + abs(min(countries_sf$mortality_rate_unintentional_poisoning_2019)) + 0.1
# Apply the Box-Cox transformation
bc_result <- MASS::boxcox(countries_sf$mortality_rate_unintentional_poisoning_2019.positive ~ 1,
lambda = seq(-3,3,0.1))
# The optimal lambda value is the one that maximizes the log-likelihood
optimal_lambda <- bc_result$x[which.max(bc_result$y)]
# Transform the data using the optimal lambda value
if (optimal_lambda == 0) {
countries_sf$latest_value.x_bc <- log(countries_sf$mortality_rate_unintentional_poisoning_2019.positive)
} else {
countries_sf$latest_value.x_bc <- (countries_sf$mortality_rate_unintentional_poisoning_2019.positive^optimal_lambda - 1) / optimal_lambda
}
# Shift the transformed variable to be non-negative
min_value <- min(countries_sf$latest_value.x_bc)
countries_sf$latest_value.x_bc <- countries_sf$latest_value.x_bc + abs(min_value) + 0.14.4 Histogram (Normalised)
# Round the transformed mortality rates to the nearest whole number
countries_sf$latest_value.x_bc_rounded <- round(countries_sf$latest_value.x_bc)
# Convert the latest_value.x_bc_rounded variable into a categorical variable
countries_sf$latest_value.x_bc_cat <- cut(countries_sf$latest_value.x_bc_rounded, breaks = 50)
# Create a stacked bar chart
stacked_bar_chart_bc <- ggplot(countries_sf, aes(x = latest_value.x_bc_rounded, fill = area)) +
geom_bar(position = "stack", bins =20) +
labs(x = "Transformed Mortality Rate",
y = "Count",
title = "Stacked Bar Chart of Transformed Mortality Rates") +
theme_minimal()Warning in geom_bar(position = "stack", bins = 20): Ignoring unknown
parameters: `bins`
ggplot_stacked_bar_chart_bc <- ggplotly(stacked_bar_chart_bc)
ggplot_stacked_bar_chart_bc# Create a grouped bar chart
grouped_bar_chart_bc <- ggplot(countries_sf, aes(x = latest_value.x_bc_rounded, fill = area)) +
geom_bar(position = "dodge") +
labs(x = "Transformed Mortality Rate",
y = "Count",
title = "Grouped Bar Chart of Transformed Mortality Rates") +
theme_minimal()
# Convert to a plotly plot
ggplot_grouped_bar_chart_bc <- ggplotly(grouped_bar_chart_bc)
ggplot_grouped_bar_chart_bc4.5 Scatter plot Normalised
plot_loess <- ggplot(countries_sf, aes(x = latest_value.x_bc, y = pop_with_basic_handwashing_facilites_2019)) +
geom_point(aes(color = area,
shape = area,
text = paste("Country:", country, "<br>", "Location:", area)), alpha = 0.7) +
geom_smooth(data = subset(countries_sf, area == "All areas"),
method = loess, se = FALSE,
aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
show.legend = TRUE) +
geom_smooth(data = subset(countries_sf, area == "Urban"),
method = loess,
se = FALSE,
aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
show.legend = TRUE) +
geom_smooth(data = subset(countries_sf, area == "Rural"),
method = loess,
se = FALSE,
aes(color = area, weight = pop_with_basic_handwashing_facilites_2019),
show.legend = TRUE) +
labs(x = "Transformed Mortality Rate (deaths per 100,000 population) for 2019",
y = "Population with Basic Handwashing Facilities for 2019 (%)",
title = "Correlation between Mortality Rate and Basic Handwashing Facilities (Normalised) for Both Gender",
color = "Location Type",
shape = "Location Type") +
theme_minimal() +
guides(color = guide_legend(override.aes = list(shape = 16)),
shape = guide_legend(override.aes = list(color = "black")))Warning in geom_point(aes(color = area, shape = area, text = paste("Country:",
: Ignoring unknown aesthetics: text
gp_normalised_map_1 <- ggplotly(plot_loess, tooltip = "text")`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
gp_normalised_map_14.6 Correlation Normalised
correlation_all <- cor(countries_sf$latest_value.x_bc[countries_sf$area == "All areas"],
countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "All areas"])
correlation_urban <- cor(countries_sf$latest_value.x_bc[countries_sf$area == "Urban"],
countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "Urban"])
correlation_rural <- cor(countries_sf$latest_value.x_bc[countries_sf$area == "Rural"],
countries_sf$pop_with_basic_handwashing_facilites_2019[countries_sf$area == "Rural"])
blank_plot <- ggplot() +
theme_void() +
theme(plot.margin = margin(1, 1, 1, 1, "cm")) +
annotate("text",
x = 0.5,
y = 0.5,
label = paste("Correlation All:",
round(correlation_all, 2), "\n",
"Correlation Urban:",
round(correlation_urban, 2), "\n",
"Correlation Rural:",
round(correlation_rural, 2)),
size = 5,
color = "black",
hjust = 0.5)
gp_text_normalised_map_1 <- ggplotly(blank_plot)
gp_text_normalised_map_14.7 Combining normal and unormalised plots
combined_plot_map_1 <- subplot(
gp_map_1, gp_text_map_1,
gp_normalised_map_1, gp_text_normalised_map_1,
nrows = 2, margin = 0.05
)
# Print the combined plot
combined_plot_map_15 Map of Indicator 2
6 Retrieve Data
6.1 Mortality Rate due to Unsafe Water
mortality_rate_unsafe_water_map <- dplyr::select(
mortality_rate_unsafe_water,
code = 'geoAreaCode',
country = 'geoAreaName',
region = 'parentName',
iso = 'ISO3',
mortality_rate_unsafe_water = 'latest_value'
)
mortality_rate_unsafe_water_map <- mortality_rate_unsafe_water_map%>%
distinct()
mortality_rate_unsafe_water_map# A tibble: 183 × 5
code country region iso mortality_rate_unsaf…¹
<dbl> <chr> <chr> <chr> <dbl>
1 583 Micronesia (Federated States of) Oceania … FSM 3.6
2 496 Mongolia Eastern … MNG 1.3
3 499 Montenegro Southern… MNE 0
4 504 Morocco Northern… MAR 1.9
5 508 Mozambique Eastern … MOZ 27.6
6 104 Myanmar South-Ea… MMR 12.6
7 516 Namibia Southern… NAM 18.3
8 524 Nepal Southern… NPL 19.8
9 528 Netherlands Western … NLD 0.2
10 554 New Zealand Australi… NZL 0.1
# ℹ 173 more rows
# ℹ abbreviated name: ¹mortality_rate_unsafe_water
6.2 Proportion of safe water management
proportion_of_safe_water_map <- dplyr::select(
proportion_of_safe_water,
code = 'geoAreaCode',
country = 'geoAreaName',
region = 'parentName',
iso = 'ISO3',
urbanisation = "location_desc",
value_2019 = "value_2019"
)
proportion_of_safe_water_map <- proportion_of_safe_water_map%>%
distinct()
proportion_of_safe_water_map# A tibble: 286 × 6
code country region iso urbanisation value_2019
<dbl> <chr> <chr> <chr> <chr> <dbl>
1 807 North Macedonia South… MKD Rural 66
2 804 Ukraine Easte… UKR Urban 89
3 807 North Macedonia South… MKD Urban 85
4 826 United Kingdom of Great Britain a… North… GBR All areas 100
5 580 Northern Mariana Islands Ocean… MNP All areas 91
6 840 United States of America North… USA All areas 97
7 578 Norway North… NOR All areas 99
8 512 Oman Weste… OMN All areas 90
9 586 Pakistan South… PAK All areas 36
10 586 Pakistan South… PAK Rural 33
# ℹ 276 more rows
7 Cleaning and fixing geometry data
countries_geom <- read_sf("./data/WB_countries_Admin0.geojson")
countries_boundary <- st_boundary(countries_geom)
countries_sf_simplified <- ms_simplify(countries_boundary, keep = 0.05)
countries_sf_valid <- st_make_valid(countries_sf_simplified)
countries_wrapped <- st_wrap_dateline(countries_sf_valid)
polygon <- st_polygon(x = list(rbind(c(-180.0001, 90),
c(-179.9999, 90),
c(-179.9999, -90),
c(-180.0001, -90),
c(-180.0001, 90)))) |>
st_sfc() |>
st_set_crs(4326)
countries_wrapped <- countries_wrapped |>
st_difference(polygon)Warning: attribute variables are assumed to be spatially constant throughout
all geometries
land_sf <- read_sf("data/WB_Land.geojson")
land_boundary <- st_boundary(land_sf)
land_sf_simplified <- ms_simplify(land_boundary, keep = 0.05)
land_sf_valid <- st_make_valid(land_sf_simplified)
land_wrapped <- st_wrap_dateline(land_sf_valid)
polygon <- st_polygon(x = list(rbind(c(-180.0001, 90),
c(-179.9999, 90),
c(-179.9999, -90),
c(-180.0001, -90),
c(-180.0001, 90)))) |>
st_sfc() |>
st_set_crs(4326)
land_wrapped <- land_wrapped |>
st_difference(polygon)Warning: attribute variables are assumed to be spatially constant throughout
all geometries
countries_wrapped <- countries_wrapped |>
st_difference(polygon)Warning: attribute variables are assumed to be spatially constant throughout
all geometries
# Washing data combined
countries_sf_water <- left_join(
countries_wrapped,
proportion_of_safe_water_map,
by = c("WB_A3" = "iso")
)Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 3 of `x` matches multiple rows in `y`.
ℹ Row 6 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
countries_sf_waterSimple feature collection with 325 features and 57 fields
Geometry type: MULTILINESTRING
Dimension: XY
Bounding box: xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.6341
Geodetic CRS: WGS 84
# A tibble: 325 × 58
REGION_WB ISO_N3 ISO_A3_EH NAME_AR ISO_A3 GDP_MD_EST NAME_VI WB_NAME ISO_A2
<chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
1 East Asia … 360 IDN إندوني… IDN 3028000 Indone… Indone… ID
2 East Asia … 458 MYS ماليزيا MYS 863000 Malays… Malays… MY
3 Latin Amer… 152 CHL تشيلي CHL 436100 Chile Chile CL
4 Latin Amer… 152 CHL تشيلي CHL 436100 Chile Chile CL
5 Latin Amer… 068 BOL بوليفيا BOL 78350 Bolivia Bolivia BO
6 Latin Amer… 604 PER بيرو PER 410400 Peru Peru PE
7 Latin Amer… 604 PER بيرو PER 410400 Peru Peru PE
8 Latin Amer… 604 PER بيرو PER 410400 Peru Peru PE
9 Latin Amer… 032 ARG الأرجن… ARG 879400 Argent… Argent… AR
10 Europe & C… 196 CYP قبرص CYP 29260 Cộng h… Cyprus CY
# ℹ 315 more rows
# ℹ 49 more variables: UN_A3 <chr>, FIPS_10_ <chr>, NAME_EN <chr>,
# NAME_EL <chr>, WB_RULES <chr>, OBJECTID <int>, NAME_DE <chr>,
# SUBREGION <chr>, featurecla <chr>, NAME_ZH <chr>, WB_A3 <chr>,
# LASTCENSUS <int>, TYPE <chr>, NAME_PT <chr>, NAME_BN <chr>,
# FORMAL_EN <chr>, POP_YEAR <int>, REGION_UN <chr>, LEVEL <int>,
# POP_RANK <int>, NAME_IT <chr>, FORMAL_FR <chr>, CONTINENT <chr>, …
# Mortality Rate Combined
countries_sf_mortality <- left_join(
countries_wrapped,
mortality_rate_unsafe_water_map,
by = c("WB_A3" = "iso")
)
countries_sf_mortalitySimple feature collection with 197 features and 56 fields
Geometry type: MULTILINESTRING
Dimension: XY
Bounding box: xmin: -179.9999 ymin: -55.62754 xmax: 179.9999 ymax: 83.6341
Geodetic CRS: WGS 84
# A tibble: 197 × 57
REGION_WB ISO_N3 ISO_A3_EH NAME_AR ISO_A3 GDP_MD_EST NAME_VI WB_NAME ISO_A2
<chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
1 East Asia … 360 IDN إندوني… IDN 3028000 Indone… Indone… ID
2 East Asia … 458 MYS ماليزيا MYS 863000 Malays… Malays… MY
3 Latin Amer… 152 CHL تشيلي CHL 436100 Chile Chile CL
4 Latin Amer… 068 BOL بوليفيا BOL 78350 Bolivia Bolivia BO
5 Latin Amer… 604 PER بيرو PER 410400 Peru Peru PE
6 Latin Amer… 032 ARG الأرجن… ARG 879400 Argent… Argent… AR
7 Europe & C… 196 CYP قبرص CYP 29260 Cộng h… Cyprus CY
8 South Asia 356 IND الهند IND 8721000 Ấn Độ India IN
9 East Asia … 156 CHN جمهوري… CHN 21140000 Cộng h… China CN
10 Middle Eas… 376 ISR إسرائيل ISR 297000 Israel Israel IL
# ℹ 187 more rows
# ℹ 48 more variables: UN_A3 <chr>, FIPS_10_ <chr>, NAME_EN <chr>,
# NAME_EL <chr>, WB_RULES <chr>, OBJECTID <int>, NAME_DE <chr>,
# SUBREGION <chr>, featurecla <chr>, NAME_ZH <chr>, WB_A3 <chr>,
# LASTCENSUS <int>, TYPE <chr>, NAME_PT <chr>, NAME_BN <chr>,
# FORMAL_EN <chr>, POP_YEAR <int>, REGION_UN <chr>, LEVEL <int>,
# POP_RANK <int>, NAME_IT <chr>, FORMAL_FR <chr>, CONTINENT <chr>, …
8 Creating the Maps
8.1 Get the different values for all areas, urban and rural in water management
countries_sf_water_all <- countries_sf_water %>% filter(!(urbanisation %in% c("Rural","Urban")))
countries_sf_water_urban <- countries_sf_water %>% filter(!(urbanisation %in% c("All area","Urban")))
countries_sf_water_rural <- countries_sf_water %>% filter(!(urbanisation %in% c("Urban","All areas")))8.1.1 all areas for water management
# Load the required libraries
library(leaflet)
# Define the color palette function
colorPalette <- colorRampPalette(c("#FF6969", "#00A0D6"))
# Generate the color palette based on the number of breaks
numBreaks <- 6
colors <- colorPalette(numBreaks)
# Create a Leaflet map
all_map <- leaflet(countries_sf_water_all) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorNumeric(palette=colors, domain = c(0, 20, 40, 60, 80, 100))(value_2019),
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~WB_A3,
popup = ~paste("Country Code: ", WB_A3,
"<br>Country: ", country,
"<br>Proportion of population access to safely managed drinking water service: ", value_2019))%>%
addLegend(
position = "bottomright",
pal = colorNumeric(palette = colors, domain = c(0, 100)),
values = c(0, 20, 40, 60, 80, 100),
title = "Proportion (%)",
labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
opacity = 1
)
all_map8.1.2 Urban areas
# Define the color palette function
colorPalette <- colorRampPalette(c("#FF6969", "#00A0D6"))
# Generate the color palette based on the number of breaks
numBreaks <- 6
colors <- colorPalette(numBreaks)
# Create a Leaflet map
urban_map <- leaflet(countries_sf_water_urban) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorNumeric(palette=colors, domain = c(0, 20, 40, 60, 80, 100))(value_2019),
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~WB_A3,
popup = ~paste("Country Code: ", WB_A3,
"<br>Country: ", country,
"<br>Proportion of population access to safely managed drinking water service: ", value_2019))%>%
addLegend(
position = "bottomright",
pal = colorNumeric(palette = colors, domain = c(0, 100)),
values = c(0, 20, 40, 60, 80, 100),
title = "Proportion (%)",
labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
opacity = 1
)
urban_map8.1.3 Rural
# Define the color palette function
colorPalette <- colorRampPalette(c("#FF6969", "#00A0D6"))
# Generate the color palette based on the number of breaks
numBreaks <- 6
colors <- colorPalette(numBreaks)
# Create a Leaflet map
rural_map <- leaflet(countries_sf_water_rural) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorNumeric(palette=colors, domain = c(0, 20, 40, 60, 80, 100))(value_2019),
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~WB_A3,
popup = ~paste("Country Code: ", WB_A3,
"<br>Country: ", country,
"<br>Proportion of population access to safely managed drinking water service: ", value_2019))%>%
addLegend(
position = "bottomright",
pal = colorNumeric(palette = colors, domain = c(0, 100)),
values = c(0, 20, 40, 60, 80, 100),
title = "Proportion (%)",
labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
opacity = 1
)
rural_map8.2 Create map for mortality rate
# Define the color palette function
colorPalette <- colorRampPalette(c("#FFFFDF", "#FFEEAA", "#FFBB55", "#FF7700", "#FF4400"))
# Generate the color palette based on the number of breaks
numBreaks <- 6
colors <- colorPalette(numBreaks)
# Create a Leaflet map
mortality_mapping <- leaflet(countries_sf_mortality) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~colorNumeric(palette=colors, domain = c(0, 20, 40, 60, 80, 100))(mortality_rate_unsafe_water),
fillOpacity = 0.8,
weight = 2,
opacity = 1,
color = "grey",
dashArray = "3",
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = ~WB_A3,
popup = ~paste("Country Code: ", WB_A3,
"<br>Country: ", country,
"<br>Mortality Rate from Unsafe Sanitation and Water per 100,000: ", mortality_rate_unsafe_water))%>%
addLegend(
position = "bottomright",
pal = colorNumeric(palette = colors, domain = c(0, 100)),
values = c(0, 20, 40, 60, 80, 100),
title = "Proportion (100, 000)",
labels = c("0-20", "20-40", "40-60", "60-80", "80-100"),
opacity = 1
)Warning in colorNumeric(palette = colors, domain = c(0, 20, 40, 60, 80, : Some
values were outside the color scale and will be treated as NA
mortality_mapping# Plots of Indicator between mortality rate due to unsafe drinking water and proportion of population using safely managed drinking water services.merged_data <- merge(mortality_rate_unsafe_water, proportion_of_safe_water, by=c('geoAreaCode', 'geoAreaName'))
merged_data <- merged_data[!is.na(merged_data$value_2019), ] #remove NA values for 2019
# Display the first few rows of the merged dataframe
head(merged_data) geoAreaCode geoAreaName ObjectId.x goal_code.x goal_labelEN.x
1 100 Bulgaria 113 3 Goal 3
2 104 Myanmar 7 3 Goal 3
3 104 Myanmar 7 3 Goal 3
4 104 Myanmar 7 3 Goal 3
5 112 Belarus 102 3 Goal 3
6 116 Cambodia 117 3 Goal 3
goal_descEN.x target_code.x
1 Ensure healthy lives and promote well-being for all at all ages 3.9
2 Ensure healthy lives and promote well-being for all at all ages 3.9
3 Ensure healthy lives and promote well-being for all at all ages 3.9
4 Ensure healthy lives and promote well-being for all at all ages 3.9
5 Ensure healthy lives and promote well-being for all at all ages 3.9
6 Ensure healthy lives and promote well-being for all at all ages 3.9
target_descEN.x
1 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
2 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
3 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
4 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
5 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
6 By 2030, substantially reduce the number of deaths and illnesses from hazardous chemicals and air, water and soil pollution and contamination
indicator_code.x indicator_reference.x
1 C030902 3.9.2
2 C030902 3.9.2
3 C030902 3.9.2
4 C030902 3.9.2
5 C030902 3.9.2
6 C030902 3.9.2
indicator_descEN.x
1 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
2 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
3 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
4 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
5 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
6 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (exposure to unsafe Water, Sanitation and Hygiene for All (WASH) services)
series_release.x
1 2021.Q2.G.03
2 2021.Q2.G.03
3 2021.Q2.G.03
4 2021.Q2.G.03
5 2021.Q2.G.03
6 2021.Q2.G.03
series_tags.x
1 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
2 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
3 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
4 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
5 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
6 ['hygiene', 'health', 'mortality', 'death', 'water-related diseases', 'sanitation', 'water', 'drinking water']
series.x
1 SH_STA_WASH
2 SH_STA_WASH
3 SH_STA_WASH
4 SH_STA_WASH
5 SH_STA_WASH
6 SH_STA_WASH
seriesDescription.x
1 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
2 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
3 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
4 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
5 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
6 Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (deaths per 100,000 population)
level_.x parentCode.x parentName.x type.x X.x Y.x ISO3.x
1 5 151 Eastern Europe Country 25.23763 42.75731 BGR
2 4 35 South-Eastern Asia Country 96.51752 21.19333 MMR
3 4 35 South-Eastern Asia Country 96.51752 21.19333 MMR
4 4 35 South-Eastern Asia Country 96.51752 21.19333 MMR
5 5 151 Eastern Europe Country 28.04940 53.54193 BLR
6 4 35 South-Eastern Asia Country 104.92284 12.71164 KHM
UN_Member.x Country_Profile.x UnitMultiplier.x Units_code.x
1 1 1 NA PER_100000_POP
2 1 1 NA PER_100000_POP
3 1 1 NA PER_100000_POP
4 1 1 NA PER_100000_POP
5 1 1 NA PER_100000_POP
6 1 1 NA PER_100000_POP
Units_desc.x timeSeries_id.x timeSeries_keys.x n_years.x min_year.x
1 Per 100,000 population SH_STA_WASH NA 1 2016
2 Per 100,000 population SH_STA_WASH NA 1 2016
3 Per 100,000 population SH_STA_WASH NA 1 2016
4 Per 100,000 population SH_STA_WASH NA 1 2016
5 Per 100,000 population SH_STA_WASH NA 1 2016
6 Per 100,000 population SH_STA_WASH NA 1 2016
max_year.x years.x value_2016.x latest_value.x footnotes.x nature.x
1 2016 [2016] 0.1 0.1 NA E: Estimated data
2 2016 [2016] 12.6 12.6 NA E: Estimated data
3 2016 [2016] 12.6 12.6 NA E: Estimated data
4 2016 [2016] 12.6 12.6 NA E: Estimated data
5 2016 [2016] 0.1 0.1 NA E: Estimated data
6 2016 [2016] 6.5 6.5 NA E: Estimated data
ObjectId.y goal_code.y goal_labelEN.y
1 263 6 Goal 6
2 204 6 Goal 6
3 203 6 Goal 6
4 202 6 Goal 6
5 250 6 Goal 6
6 264 6 Goal 6
goal_descEN.y
1 Ensure availability and sustainable management of water and sanitation for all
2 Ensure availability and sustainable management of water and sanitation for all
3 Ensure availability and sustainable management of water and sanitation for all
4 Ensure availability and sustainable management of water and sanitation for all
5 Ensure availability and sustainable management of water and sanitation for all
6 Ensure availability and sustainable management of water and sanitation for all
target_code.y
1 6.1
2 6.1
3 6.1
4 6.1
5 6.1
6 6.1
target_descEN.y
1 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
2 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
3 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
4 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
5 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
6 By 2030, achieve universal and equitable access to safe and affordable drinking water for all
indicator_code.y indicator_reference.y
1 C060101 6.1.1
2 C060101 6.1.1
3 C060101 6.1.1
4 C060101 6.1.1
5 C060101 6.1.1
6 C060101 6.1.1
indicator_descEN.y
1 Proportion of population using safely managed drinking water services
2 Proportion of population using safely managed drinking water services
3 Proportion of population using safely managed drinking water services
4 Proportion of population using safely managed drinking water services
5 Proportion of population using safely managed drinking water services
6 Proportion of population using safely managed drinking water services
series_release.y series_tags.y series.y
1 2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
2 2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
3 2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
4 2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
5 2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
6 2021.Q2.G.03 ['water', 'drinking water'] SH_H2O_SAFE
seriesDescription.y
1 Proportion of population using safely managed drinking water services, by urban/rural (%)
2 Proportion of population using safely managed drinking water services, by urban/rural (%)
3 Proportion of population using safely managed drinking water services, by urban/rural (%)
4 Proportion of population using safely managed drinking water services, by urban/rural (%)
5 Proportion of population using safely managed drinking water services, by urban/rural (%)
6 Proportion of population using safely managed drinking water services, by urban/rural (%)
level_.y parentCode.y parentName.y type.y X.y Y.y ISO3.y
1 5 151 Eastern Europe Country 25.23763 42.75731 BGR
2 4 35 South-Eastern Asia Country 96.51752 21.19333 MMR
3 4 35 South-Eastern Asia Country 96.51752 21.19333 MMR
4 4 35 South-Eastern Asia Country 96.51752 21.19333 MMR
5 5 151 Eastern Europe Country 28.04940 53.54193 BLR
6 4 35 South-Eastern Asia Country 104.92284 12.71164 KHM
UN_Member.y Country_Profile.y location_code location_desc UnitMultiplier.y
1 1 1 _T All areas NA
2 1 1 U Urban NA
3 1 1 R Rural NA
4 1 1 _T All areas NA
5 1 1 _T All areas NA
6 1 1 _T All areas NA
Units_code.y Units_desc.y timeSeries_id.y timeSeries_keys.y n_years.y
1 PT Percentage SH_H2O_SAFE___T location 21
2 PT Percentage SH_H2O_SAFE__U location 21
3 PT Percentage SH_H2O_SAFE__R location 21
4 PT Percentage SH_H2O_SAFE___T location 21
5 PT Percentage SH_H2O_SAFE___T location 21
6 PT Percentage SH_H2O_SAFE___T location 21
min_year.y max_year.y
1 2000 2020
2 2000 2020
3 2000 2020
4 2000 2020
5 2000 2020
6 2000 2020
years.y
1 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
2 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
3 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
4 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
5 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
6 [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
value_2000 value_2001 value_2002 value_2003 value_2004 value_2005 value_2006
1 90 90 91 91 92 92 93
2 51 52 54 56 58 59 61
3 18 20 21 23 24 26 27
4 27 29 30 32 33 35 37
5 81 81 81 81 83 84 85
6 17 17 18 18 19 19 20
value_2007 value_2008 value_2009 value_2010 value_2011 value_2012 value_2013
1 93 94 94 95 95 96 96
2 63 65 67 68 70 70 71
3 29 30 32 34 35 37 39
4 38 40 42 44 45 47 48
5 86 88 89 90 91 92 93
6 20 21 21 22 23 23 24
value_2014 value_2015 value_2016.y value_2017 value_2018 value_2019
1 97 97 97 97 97 98
2 71 71 72 72 73 73
3 41 43 45 47 49 51
4 50 51 53 54 56 58
5 94 94 95 95 95 95
6 24 25 25 26 27 27
value_2020 latest_value.y footnotes.y nature.y
1 98 98 NA E: Estimated data
2 74 74 NA E: Estimated data
3 52 52 NA E: Estimated data
4 59 59 NA E: Estimated data
5 95 95 NA E: Estimated data
6 28 28 NA E: Estimated data
8.3 Correlation
# Plot the data
plot_loess <- ggplot(merged_data, aes(x = latest_value.x, y = value_2019)) +
geom_point(aes(color = location_desc,
shape = location_desc,
text = paste("Country:", geoAreaName, "<br>", "Location:", location_desc)), alpha = 0.7) +
geom_smooth(data = subset(merged_data, location_desc == "All areas"),
method = loess, se = FALSE,
aes(color = location_desc, weight = value_2019),
show.legend = TRUE) +
geom_smooth(data = subset(merged_data, location_desc == "Urban"),
method = loess,
se = FALSE,
aes(color = location_desc, weight = value_2019),
show.legend = TRUE) +
geom_smooth(data = subset(merged_data, location_desc == "Rural"),
method = loess,
se = FALSE,
aes(color = location_desc, weight = value_2019),
show.legend = TRUE) +
labs(x = "Mortality Rate (deaths per 100,000 population)",
y = "Safely Managed Drinking Water Services (%)",
title = "Correlation between Mortality Rate and Water Services (2019)",
color = "Location Type",
shape = "Location Type") +
theme_minimal() +
guides(color = guide_legend(override.aes = list(shape = 16)),
shape = guide_legend(override.aes = list(color = "black")))Warning in geom_point(aes(color = location_desc, shape = location_desc, :
Ignoring unknown aesthetics: text
# Convert to a plotly plot
gp <- ggplotly(plot_loess, tooltip = "text")`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
# Print the plot
gpcorrelation_all <- cor(merged_data$latest_value.x[merged_data$location_desc == "All areas"],
merged_data$value_2019[merged_data$location_desc == "All areas"])
correlation_urban <- cor(merged_data$latest_value.x[merged_data$location_desc == "Urban"],
merged_data$value_2019[merged_data$location_desc == "Urban"])
correlation_rural <- cor(merged_data$latest_value.x[merged_data$location_desc == "Rural"],
merged_data$value_2019[merged_data$location_desc == "Rural"])
blank_plot <- ggplot() +
theme_void() +
theme(plot.margin = margin(1, 1, 1, 1, "cm")) +
annotate("text",
x = 0.5,
y = 0.5,
label = paste("Correlation All:",
round(correlation_all, 2), "\n",
"Correlation Urban:",
round(correlation_urban, 2), "\n",
"Correlation Rural:",
round(correlation_rural, 2)),
size = 5,
color = "black",
hjust = 0.5)
# Convert to a plotly plot
gp_text <- ggplotly(blank_plot)
gp_text8.4 Barchart and stacked barchart (without Transformation)
stacked_bar_chart <- ggplot(merged_data, aes(x = latest_value.x, fill = location_desc)) +
geom_histogram(position = "stack", bins = 20) +
labs(x = "Mortality Rate (deaths per 100,000 population)",
y = "Count",
title = "Stacked Bar Chart of Mortality Rates") +
theme_minimal()
#change to plotly
ggplot_stacked_bar_chart <- ggplotly(stacked_bar_chart)
ggplot_stacked_bar_chartgrouped_bar_chart <- ggplot(merged_data, aes(x = latest_value.x, fill = location_desc)) +
geom_histogram(position = "dodge", bins = 20) +
labs(x = "Mortality Rate (deaths per 100,000 population)",
y = "Count",
title = "Grouped Bar Chart of Mortality Rates") +
theme_minimal()
# Convert to a plotly plot
ggplot_grouped_bar_chart <- ggplotly(grouped_bar_chart)
ggplot_grouped_bar_chart# Filter data for the histogram
merged_data_all<- subset(merged_data, location_desc == "All areas") # change to rural or urban if you want
histogram <- ggplot(merged_data_all, aes(x = latest_value.x,
text = paste("Country:", geoAreaName,
"<br>Mortality Rate:", latest_value.x))) +
geom_histogram(bins = 50, fill = 'grey', color = 'black') +
labs(x = 'Mortality Rate (deaths per 100,000 population)',
y = 'Count',
title = 'Histogram of Mortality Rates for "All" locations') +
theme_minimal()
# Convert ggplot histogram to a plotly plot
plotly_histogram <- ggplotly(histogram, tooltip = "text")8.5 Normalise Data
merged_data$latest_value.x_positive <- merged_data$latest_value.x + abs(min(merged_data$latest_value.x)) + 0.1
# Apply the Box-Cox transformation
bc_result <- MASS::boxcox(merged_data$latest_value.x_positive ~ 1,
lambda = seq(-3,3,0.1))
# The optimal lambda value is the one that maximizes the log-likelihood
optimal_lambda <- bc_result$x[which.max(bc_result$y)]
# Transform the data using the optimal lambda value
if (optimal_lambda == 0) {
merged_data$latest_value.x_bc <- log(merged_data$latest_value.x_positive)
} else {
merged_data$latest_value.x_bc <- (merged_data$latest_value.x_positive^optimal_lambda - 1) / optimal_lambda
}
# Shift the transformed variable to be non-negative
min_value <- min(merged_data$latest_value.x_bc)
merged_data$latest_value.x_bc <- merged_data$latest_value.x_bc + abs(min_value) + 0.18.6 Histogram (Normalised)
# Round the transformed mortality rates to the nearest whole number
merged_data$latest_value.x_bc_rounded <- round(merged_data$latest_value.x_bc)
# Convert the latest_value.x_bc_rounded variable into a categorical variable
merged_data$latest_value.x_bc_cat <- cut(merged_data$latest_value.x_bc_rounded, breaks = 50)
# Create a stacked bar chart
stacked_bar_chart_bc <- ggplot(merged_data, aes(x = latest_value.x_bc_rounded, fill = location_desc)) +
geom_bar(position = "stack", bins =20) +
labs(x = "Transformed Mortality Rate",
y = "Count",
title = "Stacked Bar Chart of Transformed Mortality Rates") +
theme_minimal()Warning in geom_bar(position = "stack", bins = 20): Ignoring unknown
parameters: `bins`
ggplot_stacked_bar_chart_bc <- ggplotly(stacked_bar_chart_bc)
ggplot_stacked_bar_chart_bc# Create a grouped bar chart
grouped_bar_chart_bc <- ggplot(merged_data, aes(x = latest_value.x_bc_rounded, fill = location_desc)) +
geom_bar(position = "dodge") +
labs(x = "Transformed Mortality Rate",
y = "Count",
title = "Grouped Bar Chart of Transformed Mortality Rates") +
theme_minimal()
# Convert to a plotly plot
ggplot_grouped_bar_chart_bc <- ggplotly(grouped_bar_chart_bc)
ggplot_grouped_bar_chart_bc8.7 Scatter plot Normalised
plot_loess <- ggplot(merged_data, aes(x = latest_value.x_bc, y = value_2019)) +
geom_point(aes(color = location_desc,
shape = location_desc,
text = paste("Country:", geoAreaName, "<br>", "Location:", location_desc)), alpha = 0.7) +
geom_smooth(data = subset(merged_data, location_desc == "All areas"),
method = loess, se = FALSE,
aes(color = location_desc, weight = value_2019),
show.legend = TRUE) +
geom_smooth(data = subset(merged_data, location_desc == "Urban"),
method = loess,
se = FALSE,
aes(color = location_desc, weight = value_2019),
show.legend = TRUE) +
geom_smooth(data = subset(merged_data, location_desc == "Rural"),
method = loess,
se = FALSE,
aes(color = location_desc, weight = value_2019),
show.legend = TRUE) +
labs(x = "Transformed Mortality Rate (deaths per 100,000 population)",
y = "Safely Managed Drinking Water Services (%)",
title = "Correlation between Transformed Mortality Rate and Water Services (Normalised)",
color = "Location Type",
shape = "Location Type") +
theme_minimal() +
guides(color = guide_legend(override.aes = list(shape = 16)),
shape = guide_legend(override.aes = list(color = "black")))Warning in geom_point(aes(color = location_desc, shape = location_desc, :
Ignoring unknown aesthetics: text
gp_normalised <- ggplotly(plot_loess, tooltip = "text")`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
gp_normalised8.8 Correlation normalised
correlation_all <- cor(merged_data$latest_value.x_bc[merged_data$location_desc == "All areas"],
merged_data$value_2019[merged_data$location_desc == "All areas"])
correlation_urban <- cor(merged_data$latest_value.x_bc[merged_data$location_desc == "Urban"],
merged_data$value_2019[merged_data$location_desc == "Urban"])
correlation_rural <- cor(merged_data$latest_value.x_bc[merged_data$location_desc == "Rural"],
merged_data$value_2019[merged_data$location_desc == "Rural"])
blank_plot <- ggplot() +
theme_void() +
theme(plot.margin = margin(1, 1, 1, 1, "cm")) +
annotate("text",
x = 0.5,
y = 0.5,
label = paste("Correlation All:",
round(correlation_all, 2), "\n",
"Correlation Urban:",
round(correlation_urban, 2), "\n",
"Correlation Rural:",
round(correlation_rural, 2)),
size = 5,
color = "black",
hjust = 0.5)
gp_text_normalised <- ggplotly(blank_plot)
gp_text_normalised8.9 Combining normal and unormalised plots
combined_plot <- subplot(
gp, gp_text,
gp_normalised, gp_text_normalised,
nrows = 2, margin = 0.05
)
# Print the combined plot
combined_plot9 Four Indicators
#mortality_rate_attributed_to_unsafe_water
mortality_rate_unsafe_water_indicators <- read_csv("data/indicator_3.9.2.csv",show_col_types = FALSE)
#Proportion_of_population_using_safely_managed_drinking_water_services
proportion_of_safe_water_indicators<- read_csv("data/indicator_6.1.1.csv",show_col_types = FALSE)
#mortality_rate_unintentional_poisoning
mortality_rate_unintentional_poisoning_indicators <- read_csv("data/indicator_3.9.3.csv",show_col_types = FALSE)
#proportion-of-population-with-basic-handwashing-facilities-on-premises-by-urban-rural-percent
population_with_basic_handwashing_facilities_indicators<- read_csv("data/indicator_6.2.1.csv",show_col_types = FALSE)mortality_rate_unsafe_water_indicators <- dplyr::select(
mortality_rate_unsafe_water_indicators,
code = 'geoAreaCode',
country = 'geoAreaName',
region = 'parentName',
iso = 'ISO3',
MR_unsafe_water = 'latest_value'
)
mortality_rate_unsafe_water_indicators <- mortality_rate_unsafe_water_indicators %>%
distinct()
mortality_rate_unsafe_water_indicators# A tibble: 183 × 5
code country region iso MR_unsafe_water
<dbl> <chr> <chr> <chr> <dbl>
1 583 Micronesia (Federated States of) Oceania (exc. A… FSM 3.6
2 496 Mongolia Eastern Asia MNG 1.3
3 499 Montenegro Southern Europe MNE 0
4 504 Morocco Northern Africa MAR 1.9
5 508 Mozambique Eastern Africa MOZ 27.6
6 104 Myanmar South-Eastern A… MMR 12.6
7 516 Namibia Southern Africa NAM 18.3
8 524 Nepal Southern Asia (… NPL 19.8
9 528 Netherlands Western Europe NLD 0.2
10 554 New Zealand Australia and N… NZL 0.1
# ℹ 173 more rows
proportion_of_safe_water_indicators <- dplyr::select(
proportion_of_safe_water_indicators,
code = 'geoAreaCode',
country = 'geoAreaName',
region = 'parentName',
iso = 'ISO3',
urbanisation = "location_desc",
sanitation_access = "value_2019",
)
# Remove duplicates
proportion_of_safe_water_indicators <- proportion_of_safe_water_indicators%>%
distinct()
proportion_of_safe_water_indicators# A tibble: 286 × 6
code country region iso urbanisation sanitation_access
<dbl> <chr> <chr> <chr> <chr> <dbl>
1 807 North Macedonia South… MKD Rural 66
2 804 Ukraine Easte… UKR Urban 89
3 807 North Macedonia South… MKD Urban 85
4 826 United Kingdom of Great Br… North… GBR All areas 100
5 580 Northern Mariana Islands Ocean… MNP All areas 91
6 840 United States of America North… USA All areas 97
7 578 Norway North… NOR All areas 99
8 512 Oman Weste… OMN All areas 90
9 586 Pakistan South… PAK All areas 36
10 586 Pakistan South… PAK Rural 33
# ℹ 276 more rows
mortality_rate_unintentional_poisoning_indicators <- dplyr::select(
mortality_rate_unintentional_poisoning_indicators,
code = 'geoAreaCode',
country = 'geoAreaName',
region = 'parentName',
iso = 'ISO3',
gender = 'sex_desc',
MR_poisoning = 'value_2019'
)
# drop rows if contains at least one NA value
mortality_rate_unintentional_poisoning_indicators <- na.omit(mortality_rate_unintentional_poisoning_indicators)
# Remove duplicates
mortality_rate_unintentional_poisoning_indicators <- mortality_rate_unintentional_poisoning_indicators%>%
distinct()
#check for empty row
any(is.na(mortality_rate_unintentional_poisoning_indicators))[1] FALSE
mortality_rate_unintentional_poisoning_indicators# A tibble: 549 × 6
code country region iso gender MR_poisoning
<dbl> <chr> <chr> <chr> <chr> <dbl>
1 470 Malta Southern Europe MLT Both se… 0.1
2 152 Chile South America CHL Male 0.5
3 4 Afghanistan Southern Asia (excluding India) AFG Both se… 1
4 470 Malta Southern Europe MLT Female 0
5 156 China Eastern Asia CHN Both se… 1.8
6 470 Malta Southern Europe MLT Male 0.2
7 156 China Eastern Asia CHN Female 1.5
8 478 Mauritania Western Africa MRT Both se… 1.5
9 478 Mauritania Western Africa MRT Female 1.3
10 478 Mauritania Western Africa MRT Male 1.7
# ℹ 539 more rows
population_with_basic_handwashing_facilities_indicators <- dplyr::select(
population_with_basic_handwashing_facilities_indicators,
code = 'geoAreaCode',
country = 'geoAreaName',
region = 'parentName',
iso = 'ISO3',
urbanisation = 'location_desc',
handwash_access = 'value_2019'
)
# drop rows if contains at least one NA value
population_with_basic_handwashing_facilities_indicators <- na.omit(population_with_basic_handwashing_facilities_indicators)
# Remove duplicates
population_with_basic_handwashing_facilities_indicators <- population_with_basic_handwashing_facilities_indicators%>%
distinct()
#check for empty row
any(is.na(mortality_rate_unintentional_poisoning_indicators))[1] FALSE
population_with_basic_handwashing_facilities_indicators# A tibble: 250 × 6
code country region iso urbanisation handwash_access
<dbl> <chr> <chr> <chr> <chr> <dbl>
1 4 Afghanistan Southern Asia (excludi… AFG All areas 38
2 356 India Southern Asia IND Urban 82
3 360 Indonesia South-Eastern Asia IDN All areas 94
4 4 Afghanistan Southern Asia (excludi… AFG Rural 29
5 694 Sierra Leone Western Africa SLE Rural 18
6 360 Indonesia South-Eastern Asia IDN Rural 91
7 4 Afghanistan Southern Asia (excludi… AFG Urban 64
8 12 Algeria Northern Africa DZA All areas 85
9 12 Algeria Northern Africa DZA Rural 75
10 12 Algeria Northern Africa DZA Urban 88
# ℹ 240 more rows
9.1 Joining all 4 indicators
countries_tb <- left_join(
mortality_rate_unintentional_poisoning_indicators,
population_with_basic_handwashing_facilities_indicators,
by = join_by(country, iso)
)
countries_tb <- left_join(
countries_tb,
mortality_rate_unsafe_water_indicators,
by = join_by(iso)
)
countries_tb <- left_join(
countries_tb,
proportion_of_safe_water_indicators,
by = join_by(iso, urbanisation)
) |>
mutate(
iso = case_match(
iso,
"COD" ~ "ZAR",
"ROU" ~ "ROM",
"TLS" ~ "TMP",
"XKX" ~ "KSV",
.default = iso
)
)
names(countries_tb) [1] "code.x" "country.x" "region.x"
[4] "iso" "gender" "MR_poisoning"
[7] "code.y" "region.y" "urbanisation"
[10] "handwash_access" "code.x.x" "country.y"
[13] "region.x.x" "MR_unsafe_water" "code.y.y"
[16] "country" "region.y.y" "sanitation_access"
9.2 Joining with adminGeoJson
countries_sf <- read_sf("./data/WB_countries_Admin0.geojson")
countries_tb <- left_join(
countries_tb,
dplyr::select(countries_sf,WB_A3, GDP_MD_EST, INCOME_GRP, POP_EST),
by = c("iso" = "WB_A3")
)Warning in left_join(countries_tb, dplyr::select(countries_sf, WB_A3, GDP_MD_EST, : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 325 of `x` matches multiple rows in `y`.
ℹ Row 126 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
countries_tb# A tibble: 1,038 × 22
code.x country.x region.x iso gender MR_poisoning code.y region.y
<dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 470 Malta Southern Europe MLT Both … 0.1 NA <NA>
2 152 Chile South America CHL Male 0.5 NA <NA>
3 4 Afghanistan Southern Asia (… AFG Both … 1 4 Souther…
4 4 Afghanistan Southern Asia (… AFG Both … 1 4 Souther…
5 4 Afghanistan Southern Asia (… AFG Both … 1 4 Souther…
6 470 Malta Southern Europe MLT Female 0 NA <NA>
7 156 China Eastern Asia CHN Both … 1.8 NA <NA>
8 470 Malta Southern Europe MLT Male 0.2 NA <NA>
9 156 China Eastern Asia CHN Female 1.5 NA <NA>
10 478 Mauritania Western Africa MRT Both … 1.5 478 Western…
# ℹ 1,028 more rows
# ℹ 14 more variables: urbanisation <chr>, handwash_access <dbl>,
# code.x.x <dbl>, country.y <chr>, region.x.x <chr>, MR_unsafe_water <dbl>,
# code.y.y <dbl>, country <chr>, region.y.y <chr>, sanitation_access <dbl>,
# GDP_MD_EST <dbl>, INCOME_GRP <chr>, POP_EST <int>,
# geometry <MULTIPOLYGON [°]>
9.3 Clean data
Remove certain columns and rename column for cleaner a look
# Drop
clean_datatable <- dplyr::select(countries_tb,
-code.y, -code.x,
-region.y, -region.x,
-code.x.x, code.y.y,
-country.x, -country.y,
-region.x.x, -region.y.y,
-iso)
# Rename
clean_datatable <- clean_datatable %>% rename(
gdp = GDP_MD_EST,
income_group = INCOME_GRP,
population = POP_EST)
# Rearrange
clean_datatable <- clean_datatable %>%
dplyr::select(country,
gdp,
income_group,
population,
urbanisation,
gender,
handwash_access,
sanitation_access,
MR_poisoning,
MR_unsafe_water,)
datatable(clean_datatable)9.4 Correlation between 4 indicators
# filter gender of both sexes to reduce duplicates
countries_tb <- countries_tb |>
filter(gender == "Both sexes") |>
na.omit(countries_tb)
countries_tb# A tibble: 143 × 22
code.x country.x region.x iso gender MR_poisoning code.y region.y
<dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 4 Afghanistan Southern Asia (… AFG Both … 1 4 Souther…
2 4 Afghanistan Southern Asia (… AFG Both … 1 4 Souther…
3 4 Afghanistan Southern Asia (… AFG Both … 1 4 Souther…
4 484 Mexico Central America MEX Both … 0.4 484 Central…
5 496 Mongolia Eastern Asia MNG Both … 2.8 496 Eastern…
6 496 Mongolia Eastern Asia MNG Both … 2.8 496 Eastern…
7 496 Mongolia Eastern Asia MNG Both … 2.8 496 Eastern…
8 499 Montenegro Southern Europe MNE Both … 0.6 499 Souther…
9 499 Montenegro Southern Europe MNE Both … 0.6 499 Souther…
10 104 Myanmar South-Eastern A… MMR Both … 1.3 104 South-E…
# ℹ 133 more rows
# ℹ 14 more variables: urbanisation <chr>, handwash_access <dbl>,
# code.x.x <dbl>, country.y <chr>, region.x.x <chr>, MR_unsafe_water <dbl>,
# code.y.y <dbl>, country <chr>, region.y.y <chr>, sanitation_access <dbl>,
# GDP_MD_EST <dbl>, INCOME_GRP <chr>, POP_EST <int>,
# geometry <MULTIPOLYGON [°]>
countries_tb_subset <- dplyr::select(countries_tb, MR_poisoning, MR_unsafe_water, handwash_access, sanitation_access,urbanisation,country.x)
# Display the subsetted data
print(countries_tb_subset)# A tibble: 143 × 6
MR_poisoning MR_unsafe_water handwash_access sanitation_access urbanisation
<dbl> <dbl> <dbl> <dbl> <chr>
1 1 13.9 38 27 All areas
2 1 13.9 29 24 Rural
3 1 13.9 64 36 Urban
4 0.4 1.1 90 43 All areas
5 2.8 1.3 84 30 All areas
6 2.8 1.3 77 11 Rural
7 2.8 1.3 88 38 Urban
8 0.6 0 99 85 All areas
9 0.6 0 99 87 Urban
10 1.3 12.6 74 58 All areas
# ℹ 133 more rows
# ℹ 1 more variable: country.x <chr>
9.5 Scatterplot matrix
#set axis names
countries_tb_subset_corr4 <- dplyr::select(countries_tb_subset, MR_unsafe_water, MR_poisoning, handwash_access, sanitation_access, urbanisation,country.x)
colnames(countries_tb_subset_corr4) <- c("Unsafe Water Mortality Rate",
"Poisoning Mortality",
"Handwashing Access (%)",
"Sanitation Access (%)",
"Urbanisation",
"Countries"
)
countries_tb_subset_corr4# A tibble: 143 × 6
`Unsafe Water Mortality Rate` `Poisoning Mortality` `Handwashing Access (%)`
<dbl> <dbl> <dbl>
1 13.9 1 38
2 13.9 1 29
3 13.9 1 64
4 1.1 0.4 90
5 1.3 2.8 84
6 1.3 2.8 77
7 1.3 2.8 88
8 0 0.6 99
9 0 0.6 99
10 12.6 1.3 74
# ℹ 133 more rows
# ℹ 3 more variables: `Sanitation Access (%)` <dbl>, Urbanisation <chr>,
# Countries <chr>
#change to factor for colour
countries_tb_subset_corr4$Urbanisation <- as.factor(countries_tb_subset_corr4$Urbanisation)
plot <- ggpairs(data = countries_tb_subset_corr4,
columns = 1:4,
upper = list(continuous = "cor"),
lower = list(continuous = "smooth", se=FALSE),
diag = list(continuous = "bar", bin=30),
mapping = aes(color = Urbanisation),
tooltips = c("Countries")) +
theme_minimal() +
theme(axis.title = element_text(size = 5))Warning in warn_if_args_exist(list(...)): Extra arguments: "tooltips" are being
ignored. If these are meant to be aesthetics, submit them using the 'mapping'
variable within ggpairs with ggplot2::aes or ggplot2::aes_string.
Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
"densityDiag", : Changing diag$continuous from 'bar' to 'barDiag'
plot`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Convert ggplot to plotly
plot_interactive <- ggplotly(plot)`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
plot_interactive <- layout(plot_interactive,
font = list(size = 8)
)
# Render the plotly scatterplot matrix
plot_interactive9.6 Different Scatter plot matrix
countries_tb_subset_corr4 <- dplyr::select(countries_tb_subset, MR_unsafe_water, MR_poisoning, handwash_access, sanitation_access, urbanisation)
# Rename the columns
colnames(countries_tb_subset_corr4) <- c("Unsafe Water Mortality Rate",
"Poisoning Mortality",
"Handwashing Access (%)",
"Sanitation Access (%)",
"Urbanisation")
# Make urbanisation as a factor for colourisation
countries_tb_subset_corr4$Urbanisation <- as.factor(countries_tb_subset_corr4$Urbanisation)
# Map urbanisation to colors
color_mapping <- c("All areas" = "red", "Urban" = "blue", "Rural" = "green") # Adjust to your actual factor levels and desired colors
colors <- color_mapping[countries_tb_subset_corr4$Urbanisation]
# Create scatterplot matrix without colors
plot <- pairs.panels(countries_tb_subset_corr4[, 1:4],
hist.col = "#00AFBB",
bg = "transparent")
10 K-means clustering
10.1 Scaling the dataset
df <- clean_datatable |>
dplyr::filter(urbanisation == "All areas", gender == "Both sexes") |>
dplyr::select(-urbanisation,-gender,-gdp) |>
na.omit()
kable(head((df)))| country | income_group | population | handwash_access | sanitation_access | MR_poisoning | MR_unsafe_water |
|---|---|---|---|---|---|---|
| Afghanistan | 5. Low income | 34124811 | 38 | 27 | 1.0 | 13.9 |
| Mexico | 3. Upper middle income | 124574795 | 90 | 43 | 0.4 | 1.1 |
| Mongolia | 4. Lower middle income | 3068243 | 84 | 30 | 2.8 | 1.3 |
| Montenegro | 3. Upper middle income | 642550 | 99 | 85 | 0.6 | 0.0 |
| Myanmar | 5. Low income | 55123814 | 74 | 58 | 1.3 | 12.6 |
| Nepal | 5. Low income | 29384297 | 61 | 19 | 1.7 | 19.8 |
numeric_cols <-
c("handwash_access",
"sanitation_access",
"MR_poisoning",
"MR_unsafe_water")
numeric_df <- df[numeric_cols]
scaled_df <- scale(numeric_df)
kable(head((scaled_df)))| handwash_access | sanitation_access | MR_poisoning | MR_unsafe_water |
|---|---|---|---|
| -0.7196189 | -0.6869441 | -0.3505204 | -0.2152268 |
| 0.9599226 | -0.0631583 | -0.8736852 | -0.7322881 |
| 0.7661293 | -0.5699842 | 1.2189739 | -0.7242090 |
| 1.2506124 | 1.5742793 | -0.6992969 | -0.7767231 |
| 0.4431406 | 0.5216408 | -0.0889380 | -0.2677408 |
| 0.0232552 | -0.9988370 | 0.2598385 | 0.0231062 |
10.2 Elbow Criterion
fviz_nbclust(scaled_df, kmeans, method = "wss")
** Elbow at 3, so we choose k=3 **
10.3 Do k-means clustering for dataset
scaled_df_table <- data.table(scaled_df)
k <- kmeans(scaled_df_table, centers = 3)
scaled_df_table[, cluster := as.factor(k$cluster)]
country <- df$country
income_group <- df$income_group
population <- df$population
scaled_df_table_with_country <-
cbind(population, income_group, country, scaled_df_table)
kable(head(scaled_df_table_with_country))| population | income_group | country | handwash_access | sanitation_access | MR_poisoning | MR_unsafe_water | cluster |
|---|---|---|---|---|---|---|---|
| 34124811 | 5. Low income | Afghanistan | -0.7196189 | -0.6869441 | -0.3505204 | -0.2152268 | 1 |
| 124574795 | 3. Upper middle income | Mexico | 0.9599226 | -0.0631583 | -0.8736852 | -0.7322881 | 1 |
| 3068243 | 4. Lower middle income | Mongolia | 0.7661293 | -0.5699842 | 1.2189739 | -0.7242090 | 1 |
| 642550 | 3. Upper middle income | Montenegro | 1.2506124 | 1.5742793 | -0.6992969 | -0.7767231 | 2 |
| 55123814 | 5. Low income | Myanmar | 0.4431406 | 0.5216408 | -0.0889380 | -0.2677408 | 1 |
| 29384297 | 5. Low income | Nepal | 0.0232552 | -0.9988370 | 0.2598385 | 0.0231062 | 1 |
11 Create pair plot highlighting clusters
ggpair_plt <- ggpairs(
scaled_df_table_with_country,
aes(colour = cluster, text = country),
diag = list(continuous = "barDiag"),
columns = c(
"handwash_access",
"sanitation_access",
"MR_poisoning",
"MR_unsafe_water"
),
columnLabels = c(
"% Hand-wash\n Facility",
"% Safe Drinking\n Water",
"% Death by\n Posion",
"% Death by\n Unsafe Water"
)
)ggplotly_plot <- ggplotly(ggpair_plt, tooltip = c("country"))`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Can only have one: highlight
ggplotly_plot
1. Red cluster = Low mortality rate
2. Green cluster = High sanitation and Low mortality rate
3. Blue cluster = Low sanitation, high mortality rate
12 Decision Tree
12.1 Add country code to dataset
scaled_df_table_with_cc <-
scaled_df_table_with_country |>
mutate(code = countrycode(country, origin = 'country.name', destination = 'wb'))
kable(head(scaled_df_table_with_cc))| population | income_group | country | handwash_access | sanitation_access | MR_poisoning | MR_unsafe_water | cluster | code |
|---|---|---|---|---|---|---|---|---|
| 34124811 | 5. Low income | Afghanistan | -0.7196189 | -0.6869441 | -0.3505204 | -0.2152268 | 1 | AFG |
| 124574795 | 3. Upper middle income | Mexico | 0.9599226 | -0.0631583 | -0.8736852 | -0.7322881 | 1 | MEX |
| 3068243 | 4. Lower middle income | Mongolia | 0.7661293 | -0.5699842 | 1.2189739 | -0.7242090 | 1 | MNG |
| 642550 | 3. Upper middle income | Montenegro | 1.2506124 | 1.5742793 | -0.6992969 | -0.7767231 | 2 | MNE |
| 55123814 | 5. Low income | Myanmar | 0.4431406 | 0.5216408 | -0.0889380 | -0.2677408 | 1 | MMR |
| 29384297 | 5. Low income | Nepal | 0.0232552 | -0.9988370 | 0.2598385 | 0.0231062 | 1 | NPL |
any_na <- any(is.na(scaled_df_table_with_cc))
any_na[1] FALSE
countries_sf <- read_sf("./data/WB_countries_Admin0.geojson")
countries_sf <- st_drop_geometry(countries_sf) |>
dplyr::select(WB_A3, REGION_WB)
dataset <-
merge(scaled_df_table_with_cc,
countries_sf,
by.x = "code",
by.y = "WB_A3")
kable(head(dataset))| code | population | income_group | country | handwash_access | sanitation_access | MR_poisoning | MR_unsafe_water | cluster | REGION_WB |
|---|---|---|---|---|---|---|---|---|---|
| AFG | 34124811 | 5. Low income | Afghanistan | -0.7196189 | -0.6869441 | -0.3505204 | -0.2152268 | 1 | South Asia |
| ARM | 3045191 | 4. Lower middle income | Armenia | 1.1214169 | 1.6132659 | -0.6121028 | -0.7686440 | 2 | Europe & Central Asia |
| BGD | 157826578 | 5. Low income | Bangladesh | -0.1705381 | 0.5216408 | -0.9608793 | -0.2960176 | 1 | South Asia |
| BTN | 758288 | 4. Lower middle income | Bhutan | 0.9922214 | -0.3360646 | -1.0480735 | -0.6191809 | 1 | South Asia |
| CAF | 5625118 | 5. Low income | Central African Republic | -1.2686998 | -1.5056629 | 1.2189739 | 2.5397406 | 3 | Sub-Saharan Africa |
| CIV | 24184810 | 4. Lower middle income | Côte d’Ivoire | -1.2686998 | -0.3750512 | 0.9573916 | 1.1299406 | 3 | Sub-Saharan Africa |
any_na <- any(is.na(dataset))
any_na[1] FALSE
dplyr::glimpse(dataset)Rows: 49
Columns: 10
$ code <chr> "AFG", "ARM", "BGD", "BTN", "CAF", "CIV", "COG", "CO…
$ population <int> 34124811, 3045191, 157826578, 758288, 5625118, 24184…
$ income_group <chr> "5. Low income", "4. Lower middle income", "5. Low i…
$ country <chr> "Afghanistan", "Armenia", "Bangladesh", "Bhutan", "C…
$ handwash_access <dbl> -0.7196189, 1.1214169, -0.1705381, 0.9922214, -1.268…
$ sanitation_access <dbl> -0.68694407, 1.61326593, 0.52164085, -0.33606458, -1…
$ MR_poisoning <dbl> -0.35052041, -0.61210280, -0.96087932, -1.04807345, …
$ MR_unsafe_water <dbl> -0.21522678, -0.76864397, -0.29601761, -0.61918093, …
$ cluster <fct> 1, 2, 1, 1, 3, 3, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 1, 2…
$ REGION_WB <chr> "South Asia", "Europe & Central Asia", "South Asia",…
12.2 Create factor vairables for dataset
shuffle_index <- sample(1:nrow(dataset))
dataset <- dataset[shuffle_index,]
column_types <- sapply(dataset, class)
print(column_types) code population income_group country
"character" "integer" "character" "character"
handwash_access sanitation_access MR_poisoning MR_unsafe_water
"numeric" "numeric" "numeric" "numeric"
cluster REGION_WB
"factor" "character"
unique(dataset$REGION_WB)[1] "Sub-Saharan Africa" "Latin America & Caribbean"
[3] "East Asia & Pacific" "Europe & Central Asia"
[5] "South Asia" "Middle East & North Africa"
unique(dataset$income_group)[1] "5. Low income" "3. Upper middle income"
[3] "4. Lower middle income" "2. High income: nonOECD"
setnames(dataset, "REGION_WB", "region")
dataset <- dataset[, c("cluster", "region", "income_group", "population")]
dataset$income_group <- factor(
dataset$income_group,
levels = c(
'5. Low income',
'4. Lower middle income',
'3. Upper middle income',
'2. High income: nonOECD'
),
labels = c('low', 'lower middle', 'upper middle', 'high')
)
dataset$region <- factor(
dataset$region,
levels = c(
'Sub-Saharan Africa',
'South Asia',
'Middle East & North Africa',
'Europe & Central Asia',
'East Asia & Pacific',
'Latin America & Caribbean'
),
labels = c(
'Sub-Saharan Africa',
'South Asia',
'Middle East & North Africa',
'Europe & Central Asia',
'East Asia & Pacific',
'Latin America & Caribbean'
)
)
any_na <- any(is.na(dataset))
any_na[1] FALSE
clean_dataset <- dataset
dplyr::glimpse(clean_dataset)Rows: 49
Columns: 4
$ cluster <fct> 3, 1, 3, 3, 1, 3, 1, 2, 3, 3, 1, 3, 1, 1, 1, 2, 3, 1, 3, …
$ region <fct> Sub-Saharan Africa, Latin America & Caribbean, Sub-Sahara…
$ income_group <fct> low, upper middle, lower middle, low, lower middle, lower…
$ population <int> 39570125, 124574795, 190632261, 1792338, 3068243, 2418481…
12.3 Train decision tree
train_rows <-
createDataPartition(clean_dataset$cluster, p = 0.75, list = FALSE)
train <- clean_dataset[train_rows, ]
test <- clean_dataset[-train_rows, ]
# dim(train)
# dim(test)
# prop.table(table(train$cluster))
# prop.table(table(test$cluster))
tree_model <- rpart(cluster ~ ., data = train, method = "class")
split_fun <- function(x, labs, digits, varlen, faclen)
{
for (i in 1:length(labs)) {
labs[i] <- paste(strwrap(labs[i], width = 40), collapse = "\n")
}
labs
}
rpart.plot(tree_model,
extra = 102,
digits = -3,
split.fun = split_fun)
The decision tree only considered the region variable for classification.
13 Evaluation of decision tree
13.1 Relative importance
rel_impt <- as.data.frame(tree_model$variable.importance)
rel_impt tree_model$variable.importance
region 18.012821
income_group 6.492674
population 1.738095
Relative importance of the predictors for classification are as follows:
- Region: 18.01
- Income: 6.49
- Population size: 1.74
13.2 Prediction with decision tree
set.seed(12345)
prediction <- predict(tree_model, newdata = test, type = "class")
confMatrix <- table(prediction, test$cluster)
confMatrix
prediction 1 2 3
1 1 0 0
2 0 3 0
3 3 0 3
accuracy <- sum(diag(confMatrix)) / sum(confMatrix)
print(paste('Accuracy: ', accuracy))[1] "Accuracy: 0.7"
For a single train-test split of [75 25], the model obtained a decent prediction accuracy of 0.7.
13.3 Ten-fold cross validation
train_control <- trainControl(method = "cv", # Use cross validation
number = 10) # Use 10 folds
# tune_grid = expand.grid(cp=c(0.0001)) # complexity parameter (size of tree)
validated_tree <- train(cluster ~ .,
data = clean_dataset,
method = "rpart",
trControl = train_control)
validated_tree CART
49 samples
3 predictor
3 classes: '1', '2', '3'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 44, 44, 45, 45, 44, 43, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.1000000 0.5750000 0.37451614
0.1333333 0.4950000 0.25439230
0.2333333 0.3883333 0.01176471
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.1.
validated_tree$resample Accuracy Kappa Resample
1 0.40 0.2105263 Fold02
2 0.60 0.3333333 Fold01
3 0.50 0.2000000 Fold03
4 0.50 0.2500000 Fold06
5 0.60 0.4117647 Fold05
6 0.75 0.6363636 Fold04
7 0.80 0.6875000 Fold07
8 0.80 0.6875000 Fold10
9 0.40 0.1176471 Fold09
10 0.40 0.2105263 Fold08
However, the accuracy for ten-fold cross validation was low, at 0.58.
This is probably because the dataset is too small, and the model is overfitting. Overall, the model is unable to generalize to unseen data.